import numpy as np
import pandas as pd
import plotly as py
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
#import scikitplot as skplt
# Load the bank churn dataset and preview the first rows.
df = pd.read_csv("BankChurners.csv")
df.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0.000093 | 0.99991 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0.000057 | 0.99994 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0.000021 | 0.99998 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | ... | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0.000134 | 0.99987 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0.000022 | 0.99998 |
5 rows × 23 columns
df.shape  # (rows, columns)
# result:
(10127, 23)
(10127, 23)
# All column names
columns = df.columns
columns
Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
'Dependent_count', 'Education_Level', 'Marital_Status',
'Income_Category', 'Card_Category', 'Months_on_book',
'Total_Relationship_Count', 'Months_Inactive_12_mon',
'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
dtype='object')
CLIENTNUM:Client number - Unique identifier for the customer holding the account
Attrition_Flag:Flag indicative of account closure in next 6 months (between Jan to Jun 2013)
Customer_Age:Age of the account holder
Gender:Gender of the account holder
Dependent_count:Number of people financially dependent on the account holder
Education_Level:Educational qualification of account holder (ex - high school, college grad etc.)
Marital_Status:Marital status of account holder (Single, Married, Divorced, Unknown)
Income_Category:Annual income category of the account holder
Card_Category:Card type depicting the variants of the cards by value proposition (Blue, Silver and Platinum)
Months_on_book:Number of months since the account holder opened an account with the lender
Total_Relationship_Count:Total number of products held by the customer. Total number of relationships the account holder has with the bank (example - retail bank, mortgage, wealth management etc.)
Months_Inactive_12_mon:Total number of months inactive in last 12 months
Contacts_Count_12_mon:Number of Contacts in the last 12 months. No. of times the account holder called to the call center in the past 12 months
Credit_Limit:Credit limit
Total_Revolving_Bal:Total amount as revolving balance
Avg_Open_To_Buy:Open to Buy Credit Line (Average of last 12 months)
Total_Amt_Chng_Q4_Q1:Change in Transaction Amount (Q4 over Q1)
Total_Trans_Amt:Total Transaction Amount (Last 12 months)
Total_Trans_Ct:Total Transaction Count (Last 12 months)
Total_Ct_Chng_Q4_Q1:Change in Transaction Count (Q4 over Q1)
Avg_Utilization_Ratio:Average Card Utilization Ratio
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
df.dtypes # column dtypes; output below is partially shown
CLIENTNUM int64 Attrition_Flag object Customer_Age int64 Gender object Dependent_count int64 Education_Level object Marital_Status object Income_Category object Card_Category object Months_on_book int64 Total_Relationship_Count int64 Months_Inactive_12_mon int64 Contacts_Count_12_mon int64 Credit_Limit float64 Total_Revolving_Bal int64 Avg_Open_To_Buy float64 Total_Amt_Chng_Q4_Q1 float64 Total_Trans_Amt int64 Total_Trans_Ct int64 Total_Ct_Chng_Q4_Q1 float64 Avg_Utilization_Ratio float64 Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 float64 Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 float64 dtype: object
# Count how many columns fall under each dtype.
# NOTE: the top-level pd.value_counts(...) helper was deprecated and removed
# in pandas 2.x; calling value_counts() on the dtypes Series is the
# supported, equivalent spelling.
df.dtypes.value_counts()
int64 10 float64 7 object 6 dtype: int64
df.describe().style.background_gradient(cmap="ocean_r") # styled summary-statistics table
| CLIENTNUM | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 739177606.333663 | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 | 0.159997 | 0.840003 |
| std | 36903783.450231 | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 | 0.365301 | 0.365301 |
| min | 708082083.000000 | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 | 0.000008 | 0.000420 |
| 25% | 713036770.500000 | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 | 0.000099 | 0.999660 |
| 50% | 717926358.000000 | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 | 0.000181 | 0.999820 |
| 75% | 773143533.000000 | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 | 0.000337 | 0.999900 |
| max | 828343083.000000 | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 | 0.999580 | 0.999990 |
# Missing-value ratio per column (this dataset turns out to have none)
total = df.isnull().sum().sort_values(ascending=False)
Percentage = total / len(df)
# Missing-value count for every column
df.isnull().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 0 Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 0 dtype: int64
no_use = np.arange(21, df.shape[1]) # positional indices of the last two (Naive Bayes helper) columns
no_use
array([21, 22])
# Drop the two Naive Bayes helper columns (selected by position via no_use)
# and the CLIENTNUM customer id — none of them carry modeling value.
df.drop(columns=df.columns[no_use], inplace=True)
df.drop(columns="CLIENTNUM", inplace=True)
# Columns of df after dropping the unused fields:
df.columns
Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
dtype='object')
# Summary statistics again, now without the dropped columns:
df.describe().style.background_gradient(cmap="ocean_r")
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
| std | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
| min | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
| 25% | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
| 50% | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
| 75% | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
| max | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
# Usage-frequency features (numeric) plus the churn label.
# Column-list selection produces the same frame as the original
# per-Series pd.concat, as the original comment itself noted.
freq_cols = ["Customer_Age",
             "Total_Trans_Ct",
             "Total_Trans_Amt",
             "Months_Inactive_12_mon",
             "Credit_Limit",
             "Attrition_Flag"]
df_frequency = df[freq_cols]
df_frequency.head()
| Customer_Age | Total_Trans_Ct | Total_Trans_Amt | Months_Inactive_12_mon | Credit_Limit | Attrition_Flag | |
|---|---|---|---|---|---|---|
| 0 | 45 | 42 | 1144 | 1 | 12691.0 | Existing Customer |
| 1 | 49 | 33 | 1291 | 1 | 8256.0 | Existing Customer |
| 2 | 51 | 20 | 1887 | 1 | 3418.0 | Existing Customer |
| 3 | 40 | 20 | 1171 | 4 | 3313.0 | Existing Customer |
| 4 | 40 | 28 | 816 | 1 | 4716.0 | Existing Customer |
# Explore pairwise feature relationships under each Attrition_Flag value:
df["Attrition_Flag"].value_counts()
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64
# Four-panel scatter grid: each usage-related feature against Total_Trans_Ct,
# points colored by churn status. One loop replaces the four repeated calls.
fig, ax = plt.subplots(ncols=4, figsize=(20,6))
x_features = ["Total_Trans_Amt", "Months_Inactive_12_mon",
              "Credit_Limit", "Customer_Age"]
for panel, feature in zip(ax, x_features):
    sns.scatterplot(data=df_frequency,
                    x=feature,
                    y="Total_Trans_Ct",
                    hue="Attrition_Flag",
                    ax=panel)
plt.show()
# Plotly Express version: each feature vs. Total_Trans_Ct, colored by churn flag.
for col in ["Customer_Age","Total_Trans_Amt","Months_Inactive_12_mon","Credit_Limit"]:
    fig = px.scatter(df_frequency,
                     x=col,
                     y="Total_Trans_Ct",
                     color="Attrition_Flag")
    fig.show()
# Same four relationships, drawn with go.Scatter on a 1x4 shared-y subplot grid.
# Work on a copy so df_frequency stays untouched.
df_frequency_copy = df_frequency.copy()
# Marker color code: 1 = existing customer, 2 = attrited customer.
df_frequency_copy["Attrition_Flag_number"] = df_frequency_copy["Attrition_Flag"].apply(
    lambda flag: 1 if flag == "Existing Customer" else 2)
four_columns = ["Total_Trans_Amt", "Months_Inactive_12_mon", "Credit_Limit", "Customer_Age"]
fig = make_subplots(rows=1,
                    cols=4,
                    start_cell="top-left",
                    shared_yaxes=True,
                    subplot_titles=four_columns)  # one title per panel
# enumerate(..., start=1) maps feature i straight onto subplot column i,
# replacing the original //- and %-based row/column arithmetic (same layout).
for col_idx, feature in enumerate(four_columns, start=1):
    fig.add_trace(go.Scatter(x=df_frequency_copy[feature].tolist(),
                             y=df_frequency_copy["Total_Trans_Ct"].tolist(),
                             mode='markers',
                             marker=dict(color=df_frequency_copy.Attrition_Flag_number)),
                  row=1, col=col_idx)
fig.update_layout(width=1000, height=450, showlegend=False)
fig.show()
我们得到如下的几点结论:
图1:用户每年花费的金额越高,越可能留下来(非流失)
2-3个月不进行互动,用户流失的可能性较高
用户的信用额度越高,留下来的可能性越大
从图3中观察到:流失客户的信用卡使用次数大部分低于100次
从第4个图中观察到,用户年龄分布不是重要因素
用户的人口统计信息主要是包含:用户年龄、性别、受教育程度、状态(单身、已婚等)、收入水平等信息
# Demographic fields plus the churn label:
df_demographic=df[['Customer_Age',
                   'Gender',
                   'Education_Level',
                   'Marital_Status',
                   'Income_Category',
                   'Attrition_Flag']]
df_demographic.head()
| Customer_Age | Gender | Education_Level | Marital_Status | Income_Category | Attrition_Flag | |
|---|---|---|---|---|---|---|
| 0 | 45 | M | High School | Married | $60K - $80K | Existing Customer |
| 1 | 49 | F | Graduate | Single | Less than $40K | Existing Customer |
| 2 | 51 | M | Graduate | Married | $80K - $120K | Existing Customer |
| 3 | 40 | F | High School | Unknown | Less than $40K | Existing Customer |
| 4 | 40 | M | Uneducated | Married | $60K - $80K | Existing Customer |
# Age distribution for each customer type (violin plot)
px.violin(df_demographic,
          y="Customer_Age",
          color="Attrition_Flag")
从上面的小提琴图看出来,不同类型的用户在年龄上的分布是类似的。
结论:年龄并不是用户是否流失的关键因素
# Overall customer age distribution: box plot on top, histogram below
fig = make_subplots(rows=2, cols=1)
trace1=go.Box(x=df['Customer_Age'],name='Age With Box Plot',boxmean=True)
trace2=go.Histogram(x=df['Customer_Age'],name='Age With Histogram')
fig.add_trace(trace1, row=1,col=1)
fig.add_trace(trace2, row=2,col=1)
fig.update_layout(height=500, width=1000, title_text="用户年龄分布")
fig.show()
# Customer counts broken down by churn status and gender.
# reset_index(name=...) names the size column directly, replacing the
# reset_index().rename(columns={0: "number"}) two-step.
flag_gender = (df.groupby(["Attrition_Flag", "Gender"])
                 .size()
                 .reset_index(name="number"))
flag_gender
| Attrition_Flag | Gender | number | |
|---|---|---|---|
| 0 | Attrited Customer | F | 930 |
| 1 | Attrited Customer | M | 697 |
| 2 | Existing Customer | F | 4428 |
| 3 | Existing Customer | M | 4072 |
# Grouped bar chart: customer counts by churn status, split by gender
fig = px.bar(flag_gender,
             x="Attrition_Flag",
             y="number",
             color="Gender",
             barmode="group",
             text="number")
fig.show()
从上面的柱状图中看出来:
女性在本次数据中高于男性;在两种不同类型的客户中女性也是高于男性
数据不平衡:现有客户和流失客户是不平衡的,大约是8500:1627
# Cross-tab bar charts of churn status vs. categorical demographics.
# Good crosstab intro: https://pbpython.com/pandas-crosstab.html
fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, figsize=(20, 5))
for cat_col, cat_ax in zip(["Gender", "Education_Level", "Marital_Status", "Income_Category"],
                           (ax1, ax2, ax3, ax4)):
    pd.crosstab(df["Attrition_Flag"], df[cat_col]).plot(kind="bar", ax=cat_ax, ylim=[0, 5000])
# Second row: dependents, card tier, and an age box plot.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(20, 5))
pd.crosstab(df['Attrition_Flag'], df['Dependent_count']).plot(kind='bar', ax=ax1, ylim=[0, 5000])
pd.crosstab(df['Attrition_Flag'], df['Card_Category']).plot(kind='bar', ax=ax2, ylim=[0, 10000])
_box = sns.boxplot(data=df_demographic, x='Attrition_Flag', y='Customer_Age', ax=ax3)
plt.show()
# Pie chart of education-level proportions.
# Fixed the misspelled chart title ("Propotion" -> "Proportion").
fig = px.pie(df, names='Education_Level', title='Proportion Of Education Levels')
fig.show()
# Churn class counts
churn = df["Attrition_Flag"].value_counts()
churn
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64
churn.keys() # the two class labels ('Existing Customer', 'Attrited Customer')
Index(['Existing Customer', 'Attrited Customer'], dtype='object')
# Matplotlib pie chart of existing vs. attrited customer shares
plt.pie(x=churn, labels=churn.keys(),autopct="%.1f%%")
plt.show()
现有客户还是占据了绝大部分 后面将通过采样的方式使得两种类型的客户数量保持平衡。
现有数据中的字段涉及到分类型和数值型,采取不同的分析和编码方式
数值型变量:使用相关系数Pearson
分类型变量:使用Cramer’s V ;克莱姆相关系数,常用于分析双变量之间的关系
参考内容:https://blog.csdn.net/deecheanW/article/details/120474864
# Categorical (object-dtype) columns — select_dtypes(include="O") is the
# equivalent the original comment pointed at.
df_categorical = df.select_dtypes(include="O")
df_categorical.head()
# Numeric columns
df_number = df.select_dtypes(exclude="O")
df_number.head()
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 3 | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 49 | 5 | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 51 | 3 | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 40 | 4 | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 40 | 3 | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
对Attrition_Flag字段执行独热码编码操作:
# Keep the raw churn label alongside the numeric fields for later use.
df_number["Attrition_Flag"] = df.loc[:, "Attrition_Flag"]
from sklearn import preprocessing
label = preprocessing.LabelEncoder()
# Integer-encode every categorical column (the encoder is refit per column,
# exactly as the original per-column loop did).
df_categorical_encoded = pd.DataFrame(
    {column: label.fit_transform(df_categorical[column])
     for column in df_categorical.columns})
### Cramér's V — association strength between two categorical variables
from scipy.stats import chi2_contingency

def cal_cramers_v(v1, v2):
    """Return Cramér's V for two categorical series.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), ranging from 0
    (independent) to 1 (perfect association).

    Bug fix: the original returned chi2 / (n * mini) without the square
    root, which is V**2 (phi-squared normalized), systematically
    understating association strength.
    """
    crosstab = np.array(pd.crosstab(v1, v2, rownames=None, colnames=None))
    stat = chi2_contingency(crosstab)[0]  # chi-squared statistic
    obs = np.sum(crosstab)                # total observation count n
    mini = min(crosstab.shape) - 1        # min(rows, cols) - 1
    return np.sqrt(stat / (obs * mini))
# Pairwise Cramér's V over all encoded categorical columns, rounded to
# two decimals for display in the heatmap.
rows = [[round(cal_cramers_v(df_categorical_encoded[left],
                             df_categorical_encoded[right]), 2)
         for right in df_categorical_encoded]
        for left in df_categorical_encoded]
# Assemble the square coefficient matrix, labeled on both axes.
cramers_results = np.array(rows)
cramerv_matrix = pd.DataFrame(cramers_results,
                              columns=df_categorical_encoded.columns,
                              index=df_categorical_encoded.columns)
cramerv_matrix.head()
| Attrition_Flag | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
|---|---|---|---|---|---|---|
| Attrition_Flag | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 |
| Gender | 0.0 | 1.0 | 0.0 | 0.0 | 0.7 | 0.01 |
| Education_Level | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.00 |
| Marital_Status | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.00 |
| Income_Category | 0.0 | 0.7 | 0.0 | 0.0 | 1.0 | 0.00 |
# Heatmap of the Cramér's V matrix (redundant upper triangle masked out)
mask = np.triu(np.ones_like(cramerv_matrix, dtype=np.bool_))
cat_heatmap = sns.heatmap(cramerv_matrix, # coefficient matrix
                          mask=mask,
                          vmin=-1,
                          vmax=1,
                          annot=True,
                          cmap="BrBG")
cat_heatmap.set_title("Heatmap of Correlation(Categorical)", fontdict={"fontsize": 14}, pad=12)
plt.show()
# One-hot the churn label so it can join the numeric correlation matrix.
df_number['Existing Customer']=df_number['Attrition_Flag'].apply(lambda x: 1 if x=='Existing Customer' else 0)
df_number['Attrited Customer']=df_number['Attrition_Flag'].apply(lambda x: 0 if x=='Existing Customer' else 1)
# Pearson correlation over the numeric fields
from scipy import stats
# numeric_only=True: pandas >= 2.0 no longer silently drops the
# object-dtype Attrition_Flag column from corr() — without it this call
# raises. Older pandas behaves identically with the flag set.
num_corr = df_number.corr(numeric_only=True)
plt.figure(figsize = (16,6))
mask = np.triu(np.ones_like(num_corr, dtype=np.bool_))
heatmap_number = sns.heatmap(num_corr, mask=mask,
                             vmin=-1, vmax=1,
                             annot=True, cmap="RdYlBu")
heatmap_number.set_title("Heatmap of Correlation(Number)", fontdict={"fontsize": 14}, pad=12)
plt.show()
# Side-by-side ranking of every numeric feature's correlation with each
# one-hot churn class, one heatmap column per class.
fig, ax = plt.subplots(ncols=2, figsize=(15,6))
for panel, target in zip(ax, ["Existing Customer", "Attrited Customer"]):
    ranked = num_corr[[target]].sort_values(by=target, ascending=False)
    heatmap = sns.heatmap(ranked,
                          ax=panel,
                          vmin=-1,
                          vmax=1,
                          annot=True,
                          cmap="coolwarm_r")
    heatmap.set_title(f"Features Correlating with {target}s",
                      fontdict={"fontsize": 18}, pad=16)
fig.tight_layout(pad=5)
plt.show()
小结:从上面右侧的热力图中能看到下面的字段和流失类型客户是无相关的。相关系数的值在正负0.1之间(右图)
Credit Limit
Average Open To Buy
Months On Book
Age
Dependent Count
现在我们考虑将上面的字段进行删除:
# Build the modeling frame: drop the features whose |correlation| with the
# churn label was below 0.1 (listed above)
df_model = df.copy()
df_model = df_model.drop(['Credit_Limit','Customer_Age','Avg_Open_To_Buy','Months_on_book','Dependent_count'],axis=1)
# Binary-encode the target: 1 = existing customer, 0 = attrited customer
df_model['Attrition_Flag'] = df_model['Attrition_Flag'].map({'Existing Customer': 1, 'Attrited Customer': 0})
# One-hot encode the remaining categorical fields:
df_model=pd.get_dummies(df_model)
切分数据
在之前已经验证过现有客户和流失客户的数量是不均衡的,我们使用SMOTE(Synthetic Minority Oversampling Technique,通过上采样合成少量的数据)采样来平衡数据。
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# Features and target
# X = df_model.drop("Attrition_Flag", axis=1, inplace=True)  # NOTE: inplace drop returns None, hence disabled
X = df_model.loc[:, df_model.columns != "Attrition_Flag"]
y = df_model["Attrition_Flag"]
# Hold out 33% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# SMOTE: synthesize minority-class (attrited) samples to balance training data
sm = SMOTE(sampling_strategy="minority", k_neighbors=20, random_state=42)
# Resample the training set only; the test set stays untouched
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
# 1. Random forest, default hyper-parameters
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_res, y_train_res)
RandomForestClassifier()
# 2. Support vector machine
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# SVMs need standardized features, so scale inside the pipeline
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm.fit(X_train_res, y_train_res)
Pipeline(steps=[('standardscaler', StandardScaler()),
('svc', SVC(gamma='auto'))])
Pipeline(steps=[('standardscaler', StandardScaler()),
('svc', SVC(gamma='auto'))])
Pipeline(steps=[('standardscaler', StandardScaler()),
('svc', SVC(gamma='auto'))])
# 3. Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, # number of boosting stages
                                learning_rate=1.0, # shrinkage per stage
                                max_depth=1, # stumps: depth-1 trees
                                random_state=42)
gb.fit(X_train_res, y_train_res)
GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=42)
# Held-out test-set predictions from each of the three models
y_rf = rf.predict(X_test)
y_svm = svm.predict(X_test)
y_gb = gb.predict(X_test)
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrices for the three fitted models, side by side.
fig,ax=plt.subplots(ncols=3, figsize=(20,6))
ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, ax=ax[0])
# Fix: the original title line was disabled by an `x[0]` -> `ax[0]` typo,
# leaving the random-forest panel untitled while the others were labeled.
ax[0].title.set_text('RF')
ConfusionMatrixDisplay.from_estimator(svm, X_test, y_test, ax=ax[1])
ax[1].title.set_text('SVM')
ConfusionMatrixDisplay.from_estimator(gb, X_test, y_test, ax=ax[2])
ax[2].title.set_text('GB')
fig.tight_layout(pad=5)
plt.show()
# Per-model precision/recall/F1 breakdown on the held-out test set.
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
model_reports = [('Random Forest Classifier', y_rf),
                 ('Support Vector Machine', y_svm),
                 ('Gradient Boosting', y_gb)]
for idx, (model_name, preds) in enumerate(model_reports):
    # Separator between reports only (not after the last), as the original printed.
    if idx:
        print('------------------------')
    print(model_name)
    print(classification_report(y_test, preds))
Random Forest Classifier
precision recall f1-score support
0 0.84 0.84 0.84 541
1 0.97 0.97 0.97 2801
accuracy 0.95 3342
macro avg 0.91 0.90 0.90 3342
weighted avg 0.95 0.95 0.95 3342
------------------------
Support Vector Machine
precision recall f1-score support
0 0.81 0.55 0.66 541
1 0.92 0.98 0.95 2801
accuracy 0.91 3342
macro avg 0.87 0.76 0.80 3342
weighted avg 0.90 0.91 0.90 3342
------------------------
Gradient Boosting
precision recall f1-score support
0 0.83 0.84 0.84 541
1 0.97 0.97 0.97 2801
accuracy 0.95 3342
macro avg 0.90 0.90 0.90 3342
weighted avg 0.95 0.95 0.95 3342
from sklearn.model_selection import RandomizedSearchCV
# Random-forest hyper-parameter search space.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# n_estimators — number of trees in the forest (200..2000 in steps of 200)
# Fix: 'auto' was removed from RandomForestClassifier in scikit-learn 1.3
# (it was merely an alias of 'sqrt' for classifiers, so the original
# ['auto', 'sqrt'] grid searched one option twice and crashes on modern
# scikit-learn). Search the two remaining named options instead.
max_features = ['sqrt', 'log2']
# Maximum depth of each tree; None lets trees grow until leaves are pure.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
max_depth
[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]
min_samples_split = [2, 5, 10]  # min samples required to split an internal node
min_samples_leaf = [1, 2, 4]    # min samples required at a leaf node
bootstrap = [True, False]       # sample rows with / without replacement
# Full random-search space
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
rf_random = RandomizedSearchCV(
    estimator = rf, # the random-forest model defined above
    param_distributions=random_grid, # search space
    n_iter=30, # parameter combinations to sample
    cv=3, # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1) # use all CPU cores
rf_random.fit(X_train_res, y_train_res)
print(rf_random.best_params_)
Fitting 3 folds for each of 30 candidates, totalling 90 fits
{'n_estimators': 2000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': True}
使用搜索参数建模 使用上面搜索之后的参数再次建模:
# Refit the random forest with the parameters the randomized search chose.
# best_params_ printed above: n_estimators=2000, max_depth=20,
# min_samples_split=2, min_samples_leaf=1, bootstrap=True. The original
# cell hard-coded a different combination (1400 trees, depth 110) that the
# search never selected, and passed max_features='auto', which
# scikit-learn 1.3 removed ('sqrt' is the identical classifier behavior).
rf_clf_search= RandomForestClassifier(n_estimators=2000,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      max_features='sqrt',
                                      max_depth=20,
                                      bootstrap=True)
rf_clf_search.fit(X_train_res,y_train_res)
y_rf_opt=rf_clf_search.predict(X_test)
print('Random Forest Classifier (Optimized)')
print(classification_report(y_test, y_rf_opt))
_rf_opt=ConfusionMatrixDisplay.from_estimator(rf_clf_search, X_test, y_test)
Random Forest Classifier (Optimized)
precision recall f1-score support
0 0.86 0.84 0.85 541
1 0.97 0.97 0.97 2801
accuracy 0.95 3342
macro avg 0.91 0.91 0.91 3342
weighted avg 0.95 0.95 0.95 3342
网格搜索参数
from sklearn.model_selection import GridSearchCV
# Grid over the number of boosting stages
param_test1 = {'n_estimators':range(20,100,10)}
param_test1
{'n_estimators': range(20, 100, 10)}
# Run the grid search (AUC-scored, 5-fold CV)
grid_search1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=1.0, # model under search
                                                                   min_samples_split=500,
                                                                   min_samples_leaf=50,
                                                                   max_depth=8,
                                                                   max_features='sqrt',
                                                                   subsample=0.8,
                                                                   random_state=10),
                            param_grid = param_test1, # parameter grid
                            scoring='roc_auc',
                            n_jobs=4,
                            cv=5)
grid_search1.fit(X_train_res,y_train_res)
grid_search1.best_params_
{'n_estimators': 90}
# Refit gradient boosting with the searched stage count
gb_clf_opt=GradientBoostingClassifier(n_estimators=90, # value found by the grid search
                                      learning_rate=1.0,
                                      min_samples_split=500,
                                      min_samples_leaf=50,
                                      max_depth=8,
                                      max_features='sqrt',
                                      subsample=0.8,
                                      random_state=10)
# Fit again on the resampled training data
gb_clf_opt.fit(X_train_res,y_train_res)
y_gb_opt=gb_clf_opt.predict(X_test)
print('Gradient Boosting (Optimized)')
print(classification_report(y_test, y_gb_opt))
# Recall on the attrited class (label 0) — the business-critical metric
print(recall_score(y_test,y_gb_opt,pos_label=0))
_gbopt=ConfusionMatrixDisplay.from_estimator(gb_clf_opt, X_test, y_test)
_gbopt
Gradient Boosting (Optimized)
precision recall f1-score support
0 0.85 0.84 0.85 541
1 0.97 0.97 0.97 2801
accuracy 0.95 3342
macro avg 0.91 0.91 0.91 3342
weighted avg 0.95 0.95 0.95 3342
0.8428835489833642
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1fb003a9dc0>